{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3",
   "language": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "source": [
    "# I - Consolidation of datasets"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "   timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n0         28800001       28809000        0.88        0.63        3.34   \n1         28809001       28818000        0.93        0.66        3.30   \n2         28818001       28827000        0.93        0.70        3.30   \n3         28827001       28836000        1.00        0.54        3.18   \n4         28836001       28845000        1.00        0.57        3.24   \n\n   kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \n0       13.47        0.52          0.93        1          1.0   103001  \n1       13.21        0.52          0.93        1          1.0   103001  \n2       13.18        0.53          0.94        1          1.0   103001  \n3       12.14        0.52          0.92        1          1.0   103001  \n4       12.76        0.51          0.92        1          1.0   103001  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>28800001</td>\n      <td>28809000</td>\n      <td>0.88</td>\n      <td>0.63</td>\n      <td>3.34</td>\n      <td>13.47</td>\n      <td>0.52</td>\n      <td>0.93</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>28809001</td>\n      <td>28818000</td>\n      <td>0.93</td>\n      <td>0.66</td>\n      <td>3.30</td>\n      <td>13.21</td>\n      <td>0.52</td>\n      <td>0.93</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>28818001</td>\n      <td>28827000</td>\n      <td>0.93</td>\n      <td>0.70</td>\n      <td>3.30</td>\n      <td>13.18</td>\n      <td>0.53</td>\n      <td>0.94</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>28827001</td>\n      <td>28836000</td>\n      <td>1.00</td>\n      <td>0.54</td>\n      <td>3.18</td>\n      <td>12.14</td>\n      <td>0.52</td>\n      <td>0.92</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>28836001</td>\n      <td>28845000</td>\n      <td>1.00</td>\n      <td>0.57</td>\n      <td>3.24</td>\n      <td>12.76</td>\n      
<td>0.51</td>\n      <td>0.92</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001       267\n111001     10071\n113001       400\n124001       533",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>267</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>10071</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>400</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>533</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg\npatient                       \n103001   0.561798     0.724159\n111001   0.348128     0.419475\n113001   0.522500     0.661358\n124001   0.110694     0.286481",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.561798</td>\n      <td>0.724159</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.348128</td>\n      <td>0.419475</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.522500</td>\n      <td>0.661358</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.110694</td>\n      <td>0.286481</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "patients = [103001, 111001, 113001, 124001]\n",
    "source_template = '../../datasets/2_dataset_creation/df_ml_{}_norm.csv'\n",
    "\n",
    "# Read each per-patient file, tag its rows with the patient id, and concatenate\n",
    "# once at the end instead of re-copying the growing frame on every iteration.\n",
    "# Row labels are kept as loaded, so they restart for every patient file.\n",
    "patient_frames = [\n",
    "    pd.read_csv(source_template.format(patient)).assign(patient=patient)\n",
    "    for patient in patients\n",
    "]\n",
    "df_ml_conso = pd.concat(patient_frames, axis=0)\n",
    "\n",
    "display(df_ml_conso.head())\n",
    "\n",
    "print('\\nPatient count by classification:')\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient', aggfunc='count'))\n",
    "\n",
    "print('\\nPatient average by classification:')\n",
    "# The string 'mean' selects the same aggregation as np.mean; newer pandas\n",
    "# releases warn when a NumPy callable is passed as aggfunc.\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg'], index='patient', aggfunc='mean'))"
   ]
  },
  {
   "source": [
    "# II - Applying the quality threshold"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001       267\n111001     10071\n113001       400\n124001       533",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>267</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>10071</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>400</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>533</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg  classif_threshold\npatient                                          \n103001   0.561798     0.724159           0.565543\n111001   0.348128     0.419475           0.353490\n113001   0.522500     0.661358           0.535000\n124001   0.110694     0.286481           0.129456",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>classif_threshold</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.561798</td>\n      <td>0.724159</td>\n      <td>0.565543</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.348128</td>\n      <td>0.419475</td>\n      <td>0.353490</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.522500</td>\n      <td>0.661358</td>\n      <td>0.535000</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.110694</td>\n      <td>0.286481</td>\n      <td>0.129456</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# Setting a threshold : proportion of optimal by observation\n",
    "classif_threshold = 0.95\n",
    "\n",
    "# Vectorised flag: 1 where classif_avg reaches the threshold, 0 otherwise\n",
    "# (NaN compares as False and therefore maps to 0, as with the row-wise lambda).\n",
    "df_ml_conso['classif_threshold'] = (df_ml_conso['classif_avg'] >= classif_threshold).astype('int64')\n",
    "\n",
    "print('\\nPatient count:')\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient', aggfunc='count'))\n",
    "\n",
    "print('\\nPatient average by classification:')\n",
    "# 'mean' (string) avoids the warning newer pandas emits for NumPy callables.\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg', 'classif_threshold'], index='patient', aggfunc='mean'))"
   ]
  },
  {
   "source": [
    "# III - Creation of a validation dataset"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the consolidated rows; random_state pins the shuffle.\n",
    "df_ml_conso_for_model, df_ml_conso_validation = train_test_split(\n",
    "    df_ml_conso,\n",
    "    random_state=42,\n",
    "    test_size=0.2,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "   timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n0         28800001       28809000        0.88        0.63        3.34   \n1         28809001       28818000        0.93        0.66        3.30   \n2         28818001       28827000        0.93        0.70        3.30   \n3         28827001       28836000        1.00        0.54        3.18   \n4         28836001       28845000        1.00        0.57        3.24   \n\n   kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \\\n0       13.47        0.52          0.93        1          1.0   103001   \n1       13.21        0.52          0.93        1          1.0   103001   \n2       13.18        0.53          0.94        1          1.0   103001   \n3       12.14        0.52          0.92        1          1.0   103001   \n4       12.76        0.51          0.92        1          1.0   103001   \n\n   classif_threshold  \n0                  1  \n1                  1  \n2                  1  \n3                  1  \n4                  1  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n      <th>classif_threshold</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>28800001</td>\n      <td>28809000</td>\n      <td>0.88</td>\n      <td>0.63</td>\n      <td>3.34</td>\n      <td>13.47</td>\n      <td>0.52</td>\n      <td>0.93</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>28809001</td>\n      <td>28818000</td>\n      <td>0.93</td>\n      <td>0.66</td>\n      <td>3.30</td>\n      <td>13.21</td>\n      <td>0.52</td>\n      <td>0.93</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>28818001</td>\n      <td>28827000</td>\n      <td>0.93</td>\n      <td>0.70</td>\n      <td>3.30</td>\n      <td>13.18</td>\n      <td>0.53</td>\n      <td>0.94</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>28827001</td>\n      <td>28836000</td>\n      <td>1.00</td>\n      <td>0.54</td>\n      <td>3.18</td>\n      <td>12.14</td>\n      <td>0.52</td>\n      <td>0.92</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>28836001</td>\n      
<td>28845000</td>\n      <td>1.00</td>\n      <td>0.57</td>\n      <td>3.24</td>\n      <td>12.76</td>\n      <td>0.51</td>\n      <td>0.92</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (11271, 12)\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "      timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n8211         73901129       73910128        0.67        0.54        5.15   \n6080         54721972       54730971        0.92        0.62        3.01   \n7109         63983046       63992045        0.91        0.66        3.89   \n467          65703457       65712457        0.55        0.57        0.09   \n8636         77726274       77735273        0.76        0.59        0.54   \n\n      kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \\\n8211       35.58        0.48          0.93        1          1.0   111001   \n6080       15.50        0.53          0.85        1          1.0   111001   \n7109       22.35        0.57          0.87        1          1.0   111001   \n467         3.30        0.47          0.91        0          0.0   124001   \n8636       -0.44        0.51          0.72        0          0.0   111001   \n\n      classif_threshold  \n8211                  1  \n6080                  1  \n7109                  1  \n467                   0  \n8636                  0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n      <th>classif_threshold</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>8211</th>\n      <td>73901129</td>\n      <td>73910128</td>\n      <td>0.67</td>\n      <td>0.54</td>\n      <td>5.15</td>\n      <td>35.58</td>\n      <td>0.48</td>\n      <td>0.93</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>111001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>6080</th>\n      <td>54721972</td>\n      <td>54730971</td>\n      <td>0.92</td>\n      <td>0.62</td>\n      <td>3.01</td>\n      <td>15.50</td>\n      <td>0.53</td>\n      <td>0.85</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>111001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>7109</th>\n      <td>63983046</td>\n      <td>63992045</td>\n      <td>0.91</td>\n      <td>0.66</td>\n      <td>3.89</td>\n      <td>22.35</td>\n      <td>0.57</td>\n      <td>0.87</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>111001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>467</th>\n      <td>65703457</td>\n      <td>65712457</td>\n      <td>0.55</td>\n      <td>0.57</td>\n      <td>0.09</td>\n      <td>3.30</td>\n      <td>0.47</td>\n      <td>0.91</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>124001</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>8636</th>\n      <td>77726274</td>\n      
<td>77735273</td>\n      <td>0.76</td>\n      <td>0.59</td>\n      <td>0.54</td>\n      <td>-0.44</td>\n      <td>0.51</td>\n      <td>0.72</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>111001</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (9016, 12)\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "      timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n8211         73901129       73910128        0.67        0.54        5.15   \n6080         54721972       54730971        0.92        0.62        3.01   \n7109         63983046       63992045        0.91        0.66        3.89   \n467          65703457       65712457        0.55        0.57        0.09   \n8636         77726274       77735273        0.76        0.59        0.54   \n\n      kSQI_score  pSQI_score  basSQI_score  classification  \n8211       35.58        0.48          0.93               1  \n6080       15.50        0.53          0.85               1  \n7109       22.35        0.57          0.87               1  \n467         3.30        0.47          0.91               0  \n8636       -0.44        0.51          0.72               0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classification</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>8211</th>\n      <td>73901129</td>\n      <td>73910128</td>\n      <td>0.67</td>\n      <td>0.54</td>\n      <td>5.15</td>\n      <td>35.58</td>\n      <td>0.48</td>\n      <td>0.93</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>6080</th>\n      <td>54721972</td>\n      <td>54730971</td>\n      <td>0.92</td>\n      <td>0.62</td>\n      <td>3.01</td>\n      <td>15.50</td>\n      <td>0.53</td>\n      <td>0.85</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>7109</th>\n      <td>63983046</td>\n      <td>63992045</td>\n      <td>0.91</td>\n      <td>0.66</td>\n      <td>3.89</td>\n      <td>22.35</td>\n      <td>0.57</td>\n      <td>0.87</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>467</th>\n      <td>65703457</td>\n      <td>65712457</td>\n      <td>0.55</td>\n      <td>0.57</td>\n      <td>0.09</td>\n      <td>3.30</td>\n      <td>0.47</td>\n      <td>0.91</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>8636</th>\n      <td>77726274</td>\n      <td>77735273</td>\n      <td>0.76</td>\n      <td>0.59</td>\n      <td>0.54</td>\n      <td>-0.44</td>\n      <td>0.51</td>\n      <td>0.72</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (9016, 9)\n"
     ]
    }
   ],
   "source": [
    "# Build the exported validation set from the held-out split returned by\n",
    "# train_test_split. Deriving it from df_ml_conso_for_model would write the\n",
    "# training rows to the validation file.\n",
    "# errors='ignore' keeps the cell re-runnable once the columns are already gone.\n",
    "df_ml_conso_validation = (\n",
    "    df_ml_conso_validation\n",
    "    .drop(columns=['patient', 'classif', 'classif_avg'], errors='ignore')\n",
    "    .rename(columns={'classif_threshold': 'classification'})\n",
    ")\n",
    "\n",
    "for element in [df_ml_conso, df_ml_conso_for_model, df_ml_conso_validation]:\n",
    "    display(element.head())\n",
    "    print('Shape : {}'.format(element.shape))\n",
    "\n",
    "df_ml_conso_validation.to_csv('../../datasets/3_ml_patients_consolidation/df_ml_conso_validation_norm.csv', index=False)"
   ]
  },
  {
   "source": [
    "# IV - Balancing the class distribution per patient"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001       192\n111001      5710\n113001       308\n124001       114",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>192</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>5710</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>308</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>114</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg  classif_threshold\npatient                                          \n103001   0.494792     0.696354                0.5\n111001   0.491243     0.552439                0.5\n113001   0.490260     0.640614                0.5\n124001   0.421053     0.584996                0.5",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>classif_threshold</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.494792</td>\n      <td>0.696354</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.491243</td>\n      <td>0.552439</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.490260</td>\n      <td>0.640614</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.421053</td>\n      <td>0.584996</td>\n      <td>0.5</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nProportion of each class\n1    3162\n0    3162\nName: classif_threshold, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Per patient, down-sample the larger classif_threshold class so that both\n",
    "# classes contribute the same number of rows.\n",
    "# A fixed seed makes the sampled (and later exported) rows identical on every run.\n",
    "sampling_seed = 42\n",
    "balanced_parts = []\n",
    "\n",
    "for patient in patients:\n",
    "    patient_rows = df_ml_conso_for_model[df_ml_conso_for_model['patient'] == patient]\n",
    "    df_class1 = patient_rows[patient_rows['classif_threshold'] == 1]\n",
    "    df_class0 = patient_rows[patient_rows['classif_threshold'] == 0]\n",
    "\n",
    "    # Only the larger class is sampled; the smaller one is kept whole.\n",
    "    if df_class1.shape[0] >= df_class0.shape[0]:\n",
    "        df_class1 = df_class1.sample(df_class0.shape[0], random_state=sampling_seed)\n",
    "    else:\n",
    "        df_class0 = df_class0.sample(df_class1.shape[0], random_state=sampling_seed)\n",
    "\n",
    "    balanced_parts.extend([df_class0, df_class1])\n",
    "\n",
    "# Single concatenation instead of growing a DataFrame inside the loop.\n",
    "df_ml_conso_balanced = pd.concat(balanced_parts)\n",
    "\n",
    "print('\\nPatient count:')\n",
    "display(pd.pivot_table(df_ml_conso_balanced, values=['classif'], index='patient', aggfunc='count'))\n",
    "\n",
    "print('\\nPatient average by classification:')\n",
    "# 'mean' (string) avoids the warning newer pandas emits for NumPy callables.\n",
    "display(pd.pivot_table(df_ml_conso_balanced, values=['classif', 'classif_avg', 'classif_threshold'], index='patient', aggfunc='mean'))\n",
    "\n",
    "print('\\nProportion of each class')\n",
    "print(df_ml_conso_balanced['classif_threshold'].value_counts())"
   ]
  },
  {
   "source": [
    "# V - Export"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "     timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n252         58668179       58677178        0.64        0.57        2.71   \n227         58443149       58452148        0.72        0.62        3.55   \n157         57813056       57822061        0.95        0.57        2.59   \n20          28980002       28989002        0.85        0.59        2.97   \n221         58389147       58398147        0.92        0.64        3.93   \n\n     kSQI_score  pSQI_score  basSQI_score  classification  \n252        9.90        0.52          0.95               0  \n227       15.64        0.51          0.96               0  \n157       11.31        0.51          0.88               0  \n20        10.79        0.51          0.94               0  \n221       17.05        0.53          0.92               0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classification</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>252</th>\n      <td>58668179</td>\n      <td>58677178</td>\n      <td>0.64</td>\n      <td>0.57</td>\n      <td>2.71</td>\n      <td>9.90</td>\n      <td>0.52</td>\n      <td>0.95</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>227</th>\n      <td>58443149</td>\n      <td>58452148</td>\n      <td>0.72</td>\n      <td>0.62</td>\n      <td>3.55</td>\n      <td>15.64</td>\n      <td>0.51</td>\n      <td>0.96</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>157</th>\n      <td>57813056</td>\n      <td>57822061</td>\n      <td>0.95</td>\n      <td>0.57</td>\n      <td>2.59</td>\n      <td>11.31</td>\n      <td>0.51</td>\n      <td>0.88</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>20</th>\n      <td>28980002</td>\n      <td>28989002</td>\n      <td>0.85</td>\n      <td>0.59</td>\n      <td>2.97</td>\n      <td>10.79</td>\n      <td>0.51</td>\n      <td>0.94</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>221</th>\n      <td>58389147</td>\n      <td>58398147</td>\n      <td>0.92</td>\n      <td>0.64</td>\n      <td>3.93</td>\n      <td>17.05</td>\n      <td>0.53</td>\n      <td>0.92</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# Remove the bookkeeping columns, then rename whichever column is left in last\n",
    "# position to \"classification\".\n",
    "balanced_features = df_ml_conso_balanced.drop(columns=['patient', 'classif', 'classif_avg'])\n",
    "last_column = balanced_features.columns[-1]\n",
    "df_ml_conso_balanced = balanced_features.rename(columns={last_column: \"classification\"})\n",
    "display(df_ml_conso_balanced.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the balanced dataset to disk without the row index.\n",
    "balanced_csv_path = '../../datasets/3_ml_patients_consolidation/df_ml_conso_balanced_norm.csv'\n",
    "df_ml_conso_balanced.to_csv(balanced_csv_path, index=False)"
   ]
  }
 ]
}